In [1]:
from sqlalchemy import create_engine, MetaData, Table, select
import pandas as pd
import xgboost as xgb
from datetime import datetime, timedelta
import plotly.express as px
import warnings
pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore") # specify to ignore warning messages
In [2]:
# Define the SQL Server connection parameters
server = 'airborneanalytics.cpdhuvhrm3nv.us-east-1.rds.amazonaws.com'
database = 'airborneanalytics'  
username = 'Hidden'
password = 'Hidden'

connection_string = f"mssql+pyodbc://{username}:{password}@{server}/{database}?driver=ODBC+Driver+17+for+SQL+Server"

engine = create_engine(connection_string)
In [3]:
def get_airport_traffic_data(user_origin, sample_data=False):
    
    if sample_data:
        # Load sample data for airport DTW
        daily_ap_cnt_df = pd.read_csv("daily_ap_cnt_df.csv").drop(columns=['Unnamed: 0'])
        daily_ap_cnt_df = daily_ap_cnt_df[(daily_ap_cnt_df['FlightDate'] <= '2019-12-31')]
        holidays_df = pd.read_csv("holidays_df.csv").drop(columns=['Unnamed: 0'])
        holidays_df['Date'] = pd.to_datetime(holidays_df['Date'])
        return daily_ap_cnt_df, holidays_df
    
    # Define the view and table names
    view_name = 'Airport_Traffic_With_Holiday'
    holidays_table_name = 'US_Holidays'

    # Create a connection
    connection = engine.connect()

    # Reflect the view and table structures
    metadata = MetaData()
    view = Table(view_name, metadata, autoload=True, autoload_with=engine)
    holidays_table = Table(holidays_table_name, metadata, autoload=True, autoload_with=engine)

    # Build a select query to return all columns from the view
    select_query = select([view])

    # Check if user_origin is 'ALL', if not, add the 'Airport' filter
    if user_origin != 'ALL':
        select_query = select_query.where(view.c.Airport == user_origin)

    # Execute the query and fetch all rows into a Pandas DataFrame
    daily_ap_cnt_df = pd.read_sql_query(select_query, connection)

    # Build a select query for the 'US_Holidays' table
    holidays_query = select([holidays_table])

    # Execute the query and fetch all rows into a Pandas DataFrame
    holidays_df = pd.read_sql_query(holidays_query, connection)
    holidays_df['Date'] = pd.to_datetime(holidays_df['Date'])
    
#     daily_ap_cnt_df.to_csv('daily_ap_cnt_df.csv')
#     holidays_df.to_csv('holidays_df.csv')

    # Close the connection
    connection.close()

    return daily_ap_cnt_df, holidays_df
In [4]:
# Set list of columns that should be contained in the predicted df
column_list = ['FlightDate', 'holiday_bool','Month_1', 'Month_2',
 'Month_3', 'Month_4', 'Month_5', 'Month_6',
 'Month_7', 'Month_8', 'Month_9', 'Month_10',
 'Month_11', 'Month_12', 'Holiday_4th of July', 'Holiday_Christmas Day',
 'Holiday_Christmas Eve', 'Holiday_Columbus Day', 'Holiday_Eastern Easter',
 'Holiday_Juneteenth', 'Holiday_Labor Day', 'Holiday_Labor Day Weekend',
 'Holiday_Martin Luther King, Jr. Day', 'Holiday_Memorial Day', "Holiday_New Year's Day",
 "Holiday_New Year's Eve", 'Holiday_Thanksgiving Day', 'Holiday_Thanksgiving Eve',
 "Holiday_Valentine's Day", 'Holiday_Veterans Day', "Holiday_Washington's Birthday",
 'Holiday_Western Easter', 'DayOfWeek_0', 'DayOfWeek_1', 'DayOfWeek_2',
 'DayOfWeek_3', 'DayOfWeek_4', 'DayOfWeek_5', 'DayOfWeek_6', 'weekofyear_1', 'weekofyear_2', 'weekofyear_3', 'weekofyear_4', 'weekofyear_5', 'weekofyear_6',
'weekofyear_7', 'weekofyear_8', 'weekofyear_9', 'weekofyear_10', 'weekofyear_11', 'weekofyear_12',
'weekofyear_13', 'weekofyear_14', 'weekofyear_15', 'weekofyear_16', 'weekofyear_17', 'weekofyear_18',
'weekofyear_19', 'weekofyear_20', 'weekofyear_21', 'weekofyear_22', 'weekofyear_23', 'weekofyear_24',
'weekofyear_25', 'weekofyear_26', 'weekofyear_27', 'weekofyear_28', 'weekofyear_29', 'weekofyear_30',
'weekofyear_31', 'weekofyear_32', 'weekofyear_33', 'weekofyear_34', 'weekofyear_35', 'weekofyear_36',
'weekofyear_37', 'weekofyear_38', 'weekofyear_39', 'weekofyear_40', 'weekofyear_41', 'weekofyear_42',
'weekofyear_43', 'weekofyear_44', 'weekofyear_45', 'weekofyear_46', 'weekofyear_47', 'weekofyear_48',
'weekofyear_49', 'weekofyear_50', 'weekofyear_51', 'weekofyear_52']

def feature_engineering(df, with_dummies=True):
    # Ensure that the 'FlightDate' column is in datetime format
    df['FlightDate'] = pd.to_datetime(df['FlightDate'])
    
    # Set 'FlightDate' as the index and sort the DataFrame
    df = df.set_index('FlightDate').sort_values(by='FlightDate', ascending=True)

    # Add features
    df['Month'] = df.index.month
    df['DayOfWeek'] = df.index.dayofweek
    df['weekofyear'] = df.index.isocalendar().week
    df['holiday_bool'] = pd.notnull(df['Holiday']).astype(int)
    
    if with_dummies:
        # Perform one-hot encoding using get_dummies
        df = pd.get_dummies(df, columns=['Month', 'Holiday', 'DayOfWeek', 'weekofyear'], 
                           prefix=['Month', 'Holiday', 'DayOfWeek', 'weekofyear'], dtype=int).drop(columns=['Airport'])
        return df
    else:
        return df

def calc_historical_stats(daily_ap_cnt_df_feats):
    # Create a dictionary to map numerical index to the corresponding month
    month_mapping = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
    month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    # Create a dictionary to map numerical index to the corresponding day
    day_mapping = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


    avg_stats = daily_ap_cnt_df_feats[['Total_AirTraffic_count','Month','DayOfWeek']].reset_index()
    avg_stats_month = avg_stats[['Total_AirTraffic_count','Month']].groupby('Month').mean()#.sort_values(by='Total_AirTraffic_count', ascending=False)

    # Map the 'Month' column using the created dictionary
    avg_stats_month['Month_name'] = avg_stats_month.index.map(month_mapping)

    # Convert the 'Month' column to a Categorical data type with the desired order
    avg_stats_month['Month_name'] = pd.Categorical(avg_stats_month['Month_name'], categories=month_order, ordered=True)

    # Sort the DataFrame based on the categorical order
    avg_stats_month.sort_values('Month', inplace=True)

    avg_stats_DayOfWeek = avg_stats[['Total_AirTraffic_count','DayOfWeek']].groupby('DayOfWeek').mean()#.sort_values(by='Total_AirTraffic_count', ascending=False)
    # Map the 'DayOfWeek' column using the created dictionary
    avg_stats_DayOfWeek['Day'] = avg_stats_DayOfWeek.index.map(day_mapping)
    # Convert the 'Day' column to a Categorical data type with the desired order
    avg_stats_DayOfWeek['Day'] = pd.Categorical(avg_stats_DayOfWeek['Day'], categories=day_order, ordered=True)

    # Sort the DataFrame based on the categorical order
    avg_stats_DayOfWeek.sort_values('Day', inplace=True)
    
    return avg_stats_DayOfWeek, avg_stats_month

def user_prediction(user_date, user_origin):
    # Get historical dataset of airport traffic for the user provided airport
    daily_ap_cnt_df, holidays_df = get_airport_traffic_data(user_origin, sample_data=True)
    daily_ap_cnt_df_feats = feature_engineering(daily_ap_cnt_df)
    daily_ap_cnt_df_feats_no_dum = feature_engineering(daily_ap_cnt_df,with_dummies=False)
    avg_stats_DayOfWeek, avg_stats_month = calc_historical_stats(daily_ap_cnt_df_feats_no_dum)
    
    X_train = daily_ap_cnt_df_feats[column_list[1:]]
    y_train = daily_ap_cnt_df_feats['Total_AirTraffic_count']
    
    # Fit model based on parameters determined from initial analysis
    xgb_model = xgb.XGBRegressor(base_score=0.5, booster='gbtree',    
                       n_estimators=1500,
                       #early_stopping_rounds=50,
                       objective='reg:linear',
                       max_depth=2,
                       learning_rate=0.03,
                       n_jobs=-1,tree_method='hist')
    xgb_model.fit(X_train, y_train, verbose=100)
    
    start_date = datetime.strptime('01-01-2018', '%m-%d-%Y')
    end_date = datetime.strptime('12-31-2019', '%m-%d-%Y')
    
    # Create date range to use in prediction
    user_date = datetime.strptime(user_date, '%m-%d-%Y')
    date_list = [user_date - timedelta(days=i) for i in range(7, -8, -1)]

    # Format the dates as strings and filter based on start_date and end_date
    formatted_date_list = [date.strftime('%m-%d-%Y') for date in date_list if date < start_date or date > end_date]

    # User Query
    user_date_df = pd.DataFrame(columns=column_list)

    if len(formatted_date_list) == 0:
        # If there are no dates in the range, return the specified columns from daily_ap_cnt_df_feats
        final_df = daily_ap_cnt_df_feats[daily_ap_cnt_df_feats.index.isin(date_list)]
        final_df = final_df.rename(columns={'Total_AirTraffic_count': 'Demand'})
        return final_df['Demand'], avg_stats_DayOfWeek, avg_stats_month

    # Iterate over formatted_date_list to create rows in the DataFrame
    for date_str in formatted_date_list:
        row = [date_str] + [0] * (len(column_list) - 1)
        user_date_df = user_date_df.append(pd.Series(row, index=column_list), ignore_index=True)

    # Convert 'FlightDate' to datetime
    user_date_df['FlightDate'] = pd.to_datetime(user_date_df['FlightDate'])

    # Merge holiday dataframe
    user_date_df = user_date_df.merge(holidays_df[['Date', 'Holiday']], how='left', left_on='FlightDate', right_on='Date')
    user_date_df['holiday_bool'] = pd.notnull(user_date_df['Holiday']).astype(int)

    user_date_df = user_date_df.set_index('FlightDate')
    user_date_df['DayOfWeek'] = user_date_df.index.dayofweek
    user_date_df['Month'] = user_date_df.index.month

    # Iterate over all rows in the DataFrame
    for index, row in user_date_df.iterrows():
        # Check if 'holiday_bool' is 1
        if row['holiday_bool'] == 1:
            # Find the corresponding column name in 'Holiday' column
            holiday_name = 'Holiday_' + row['Holiday']

            # Check if the column exists in the DataFrame
            if holiday_name in user_date_df.columns:
                # Update the corresponding column value to 1
                user_date_df.loc[index, holiday_name] = 1

        # DoM = 'DayofMonth_' + str(row['DayofMonth'])
        DoW = 'DayOfWeek_' + str(row['DayOfWeek'])
        mon = 'Month_' + str(row['Month'])

        # Loop through DataFrame columns and update values
        for col in user_date_df.columns:
            # if 'DayofMonth_' in col and col == DoM:
            #     user_date_df.loc[index, col] = 1
            if 'DayOfWeek_' in col and col == DoW:
                user_date_df.loc[index, col] = 1
            elif 'Month_' in col and col == mon:
                user_date_df.loc[index, col] = 1

    # Drop unnecessary columns
    user_date_df = user_date_df.drop(columns=['Date', 'Holiday', 'DayOfWeek', 'Month']).astype(int)
    user_pred = xgb_model.predict(user_date_df[column_list[1:]])

    # Add predictions to the DataFrame
    user_date_df['Predicted_Demand'] = user_pred

    # Check if the index of user_date_df is in the index of daily_ap_cnt_df_feats
    if user_date_df.index.isin(daily_ap_cnt_df_feats.index).all():
        # If so, update the 'Prediction' column in daily_ap_cnt_df_feats
        daily_ap_cnt_df_feats['Predicted_Demand'].loc[user_date_df.index] = user_date_df['Predicted_Demand']
    else:
        # If not, add new rows to daily_ap_cnt_df_feats
        daily_ap_cnt_df_feats = pd.concat([daily_ap_cnt_df_feats, user_date_df[['Predicted_Demand']]])

    final_df = daily_ap_cnt_df_feats[daily_ap_cnt_df_feats.index.isin(date_list)]
    final_df = final_df.rename(columns={'Total_AirTraffic_count': 'Actual_Demand'})
    final_df['Demand'] = final_df['Actual_Demand'].where(final_df['Actual_Demand'].notna(), final_df['Predicted_Demand'])
    final_df['Demand'] = final_df['Demand'].round(decimals=0)

    return final_df[['Demand']], avg_stats_DayOfWeek, avg_stats_month

User Inputs and Model Outputs¶

In [5]:
user_date = '11-29-2020'
user_origin = 'DTW'
user_dest = 'JFK' # Only needed for route plotting

# If the dates +/- 7 days from user date are included in the training data, then actual demand is returned,
# otherwise the predicted value is returned.

pred_df, avg_stats_DayOfWeek, avg_stats_month = user_prediction(user_date, user_origin)

Example Plots¶

In [6]:
user_date_dt = pd.to_datetime(user_date)
# Create the line chart for predicted data
fig_pred_day = px.line(pred_df, x=pred_df.index, y='Demand', title=f"Predicted Inbound/Outbound Flights at {user_origin} on {user_date}")

# Highlight the entire day corresponding to user_date in red
fig_pred_day.add_shape(
    dict(
        type="rect",
        xref="x",
        yref="paper",
        x0=user_date_dt,
        y0=0,
        x1=user_date_dt,
        y1=1,
        fillcolor="red",  # Use red instead of blue
        opacity=0.2,
        layer="below",
        line=dict(color="red", width=60)  # Use red instead of blue
    )
)

# Center the title
fig_pred_day.update_layout(title_x=0.5)
# Set the height and width of the plot
fig_pred_day.update_layout(height=300, width=980)
fig_pred_day.update_yaxes(title_text="Total Traffic")

# Show the plot
fig_pred_day.show()

# Assuming 'user_date' is a datetime object
user_month = user_date_dt.strftime('%B')

# Create the line chart
fig = px.line(avg_stats_month, x='Month_name', y='Total_AirTraffic_count', title=f"Average Traffic at {user_origin} by Month")

# Highlight the month corresponding to user_date
fig.add_shape(
    dict(
        type="rect",
        xref="x",
        yref="paper",
        x0=user_month,
        y0=0,
        x1=user_month,
        y1=1,
        fillcolor="red",
        opacity=0.2,
        layer="below",
        line=dict(color="red", width=60)
    )
)

# Center the title
fig.update_layout(title_x=0.5)
fig.update_layout(height=300, width=980)
fig.update_yaxes(title_text="Total Traffic")
fig.update_xaxes(title_text="Month")
# Show the plot
fig.show()

# Assuming 'user_date' is a datetime object
user_day = user_date_dt.strftime('%A')

# Create the line chart
fig_day = px.line(avg_stats_DayOfWeek, x='Day', y='Total_AirTraffic_count', title=f"Average Traffic at {user_origin} by Day of Week")

# Highlight the entire day corresponding to user_date
fig_day.add_shape(
    dict(
        type="rect",
        xref="x",
        yref="paper",
        x0=user_day,
        y0=0,
        x1=user_day,
        y1=1,
        fillcolor="red",
        opacity=0.2,
        layer="below",
        line=dict(color="red", width=60)
    )
)

# Center the title
fig_day.update_layout(title_x=0.5)
fig_day.update_layout(height=300, width=980)
fig_day.update_yaxes(title_text="Total Traffic")
# Show the plot
fig_day.show()